
# Declare `all` before anything else (including the includes below) so that
# it is the first target make sees and therefore the default goal. It has no
# prerequisites or recipe here; a second `all:` line further down adds
# prerequisites to it.
all:

# OpenFst compile/link flags are set to empty before the shared .mk files are
# included.
# NOTE(review): presumably this directory does not need OpenFst; confirm that
# the included files do not reassign these variables.
OPENFST_CXXFLAGS =
OPENFST_LDLIBS =


include ../aslp.mk
include ../kaldi.mk

# Append the CUDA linker flags and libraries to the link variables.
# CUDA_LDFLAGS and CUDA_LDLIBS are not assigned in this file; presumably they
# come from one of the .mk files included above (verify).
LDFLAGS += $(CUDA_LDFLAGS)
LDLIBS += $(CUDA_LDLIBS)

# Names of the test programs belonging to this directory, one per line.
# TESTFILES is not referenced anywhere else in this file; presumably the
# default rules included at the bottom consume it (verify).
TESTFILES = \
  cu-vector-test \
  cu-matrix-test \
  cu-math-test \
  cu-test \
  cu-sp-matrix-test \
  cu-packed-matrix-test \
  cu-tp-matrix-test \
  cu-block-matrix-test \
  cu-matrix-speed-test \
  cu-vector-speed-test \
  cu-sp-matrix-speed-test \
  cu-array-test \
  cu-sparse-matrix-test \
  cu-device-test


# Object files for this directory, one per line. The first group is always
# listed; the objects in the conditional are added only when CUDA is "true".
# OBJFILES is not referenced elsewhere in this file; presumably the default
# rules included at the bottom consume it (verify).
OBJFILES = \
  cu-device.o \
  cu-math.o \
  cu-matrix.o \
  cu-packed-matrix.o \
  cu-sp-matrix.o \
  cu-vector.o \
  cu-common.o \
  cu-tp-matrix.o \
  cu-rand.o \
  cu-block-matrix.o \
  cu-sparse-matrix.o \
  cu-allocator.o

ifeq ($(CUDA), true)
  OBJFILES += \
    cu-kernels.o \
    cu-randkernels.o \
    cu-nnet-mpi-sync.o
endif

# Base name of the library for this directory. LIBNAME is not used again in
# this file; presumably the default rules included at the bottom derive the
# library file name from it (verify).
LIBNAME = aslp-cudamatrix

# Add $(LIBFILE) to the prerequisites of the default goal.
# NOTE(review): LIBFILE is not assigned in this file, and make expands rule
# prerequisites when the line is read. Unless one of the .mk files included
# above already defines LIBFILE, this expands to nothing and the line adds no
# prerequisite -- confirm.
all:  $(LIBFILE)


# Select the -gencode options handed to nvcc (via CUDA_ARCH) according to the
# installed CUDA Toolkit version. Nothing in this section is evaluated unless
# CUDA is "true".
#
# The CUDA_VER_GT_x_y helpers are "true" when the toolkit version is greater
# than OR EQUAL to x.y (the shell test uses -ge, despite the GT in the name)
# and empty otherwise.
ifeq ($(CUDA), true)
  # Default compute capability architecture we always compile for.
  CUDA_ARCH=-gencode arch=compute_20,code=sm_20

  # CUDA Toolkit version with the first decimal point removed: the sed script
  # keeps what follows "release " up to the next comma and drops the first
  # '.', so a "release 7.5, ..." line yields 75.
  # Simply expanded (:=) on purpose: with a recursive '=' the nvcc|grep|sed
  # pipeline would be re-run for every $(CUDA_VERSION) reference below.
  CUDA_VERSION := $(shell $(CUDATKDIR)/bin/nvcc -V | grep release | sed -e 's|.*release ||' -e 's|,.*||' -e 's|\.||')

  # Toolkit 4.2 or newer: add compute capability 3.0.
  CUDA_VER_GT_4_2 := $(shell [ $(CUDA_VERSION) -ge 42 ] && echo true)
  ifeq ($(CUDA_VER_GT_4_2), true)
    CUDA_ARCH += -gencode arch=compute_30,code=sm_30
  endif

  # Toolkit 5.0 or newer: add compute capability 3.5.
  CUDA_VER_GT_5_0 := $(shell [ $(CUDA_VERSION) -ge 50 ] && echo true)
  ifeq ($(CUDA_VER_GT_5_0), true)
    CUDA_ARCH += -gencode arch=compute_35,code=sm_35
  endif

  # Toolkit 6.0 or newer: add compute capability 5.0.
  CUDA_VER_GT_6_0 := $(shell [ $(CUDA_VERSION) -ge 60 ] && echo true)
  ifeq ($(CUDA_VER_GT_6_0), true)
    CUDA_ARCH += -gencode arch=compute_50,code=sm_50
  endif

  # Toolkit older than 6.5: add compute capabilities 1.3 and 1.0.
  # Note the ifneq: this branch is taken when the ">= 6.5" test did NOT
  # succeed, which includes the case where the version could not be read.
  CUDA_VER_GT_6_5 := $(shell [ $(CUDA_VERSION) -ge 65 ] && echo true)
  ifneq ($(CUDA_VER_GT_6_5), true)
    CUDA_ARCH += -gencode arch=compute_13,code=sm_13 \
                 -gencode arch=compute_10,code=sm_10
  endif

  # Toolkit 7.0 or newer: add compute capabilities 5.2 and 5.3.
  CUDA_VER_GT_7_0 := $(shell [ $(CUDA_VERSION) -ge 70 ] && echo true)
  ifeq ($(CUDA_VER_GT_7_0), true)
    CUDA_ARCH += -gencode arch=compute_52,code=sm_52 \
                 -gencode arch=compute_53,code=sm_53
  endif

  # Toolkit 8.0 or newer: add compute capabilities 6.0, 6.1 and 6.2.
  CUDA_VER_GT_8_0 := $(shell [ $(CUDA_VERSION) -ge 80 ] && echo true)
  ifeq ($(CUDA_VER_GT_8_0), true)
    CUDA_ARCH += -gencode arch=compute_60,code=sm_60 \
                 -gencode arch=compute_61,code=sm_61 \
                 -gencode arch=compute_62,code=sm_62
  endif

endif


# Pattern rule for CUDA kernel compilation: build foo.o from foo.cu with the
# nvcc found under $(CUDATKDIR)/bin. The include paths, compiler flags and
# -gencode architecture list come from CUDA_INCLUDE, CUDA_FLAGS and CUDA_ARCH;
# -I../ adds the parent directory to the include search path.
# CUDA_ARCH is only assigned in this file when CUDA is "true"; CUDATKDIR,
# CUDA_INCLUDE and CUDA_FLAGS are not assigned here (presumably they come from
# the included .mk files -- verify).
%.o : %.cu
	$(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../

# Static libraries from sibling directories (matrix, base, util) that this
# directory depends on. ADDLIBS is not used in this file; presumably the
# default rules included below consume it (verify).
ADDLIBS = ../matrix/kaldi-matrix.a \
          ../base/kaldi-base.a \
          ../util/kaldi-util.a 

# Pull in the shared build rules. This comes last, after every variable
# assignment in this file.
include ../makefiles/default_rules.mk

